Initialisation et import des données
install.packages("tidyverse")
Error in install.packages : Updating loaded packages
install.packages("pacman")
Error in install.packages : Updating loaded packages
install.packages("dplyr")
Error in install.packages : Updating loaded packages
library(tidyverse)
# Set the default plot size through the `repr.plot.width` and
# `repr.plot.height` options (presumably read by the notebook front end when
# it renders a plot -- confirm for the environment in use).
#
# width:  value stored in `repr.plot.width`.
# heigth: value stored in `repr.plot.height` (the parameter name is kept as
#         spelled so that calls passing it by name keep working).
# Returns, invisibly, the previous values of the two options, exactly as
# `options()` reports them.
fig <- function(width, heigth) {
  plot_size <- list(repr.plot.width = width, repr.plot.height = heigth)
  options(plot_size)
}
library(pacman)
pacman::p_load(pacman,dplyr, ggplot2, rio, gridExtra, scales, ggcorrplot, caret, e1071)
# Load the training data; the relative path is resolved against the current
# working directory.
dataSet <- read.csv('data/train.csv')
# List the column names of the loaded data frame.
names(dataSet)
[1] "battery_power" "blue" "clock_speed" "dual_sim" "fc" "four_g" "int_memory" "m_dep" "mobile_wt" "n_cores" "pc"
[12] "px_height" "px_width" "ram" "sc_h" "sc_w" "talk_time" "three_g" "touch_screen" "wifi" "price_range"
Affichage du set de données.
Colonnes : battery_power, blue, clock_speed, dual_sim, fc, four_g, int_memory, m_dep, mobile_wt, n_cores, pc, px_height, px_width, ram, sc_h, sc_w, talk_time, three_g, touch_screen, wifi, price_range
battery_power: total energy a battery can store at one time, measured in mAh; blue: has Bluetooth or not; clock_speed: speed at which the microprocessor executes instructions; dual_sim: has dual SIM support or not; fc: front camera megapixels; four_g: has 4G or not; int_memory: internal memory in gigabytes; m_dep: mobile depth in cm; mobile_wt: weight of the mobile phone; n_cores: number of processor cores; pc: primary camera megapixels; px_height: pixel resolution height; px_width: pixel resolution width; ram: random access memory in megabytes; sc_h: screen height of the mobile in cm; sc_w: screen width of the mobile in cm; talk_time: longest time that a single battery charge will last when you are talking; three_g: has 3G or not; touch_screen: has a touch screen or not; wifi: has Wi-Fi or not; price_range: the target variable, with values 0 (low cost), 1 (medium cost), 2 (high cost) and 3 (very high cost).
# Number of rows and columns in the data frame.
dim(dataSet)
[1] 2000 21
# Confirm the type of object returned by read.csv().
class(dataSet)
[1] "data.frame"
# Preview the first rows, then report the class of every column.
head(dataSet)
sapply(dataSet, class)
battery_power blue clock_speed dual_sim fc four_g int_memory m_dep mobile_wt n_cores pc px_height px_width
"integer" "integer" "numeric" "integer" "integer" "integer" "integer" "numeric" "integer" "integer" "integer" "integer" "integer"
ram sc_h sc_w talk_time three_g touch_screen wifi price_range
"integer" "integer" "integer" "integer" "integer" "integer" "integer" "integer"
# Per-column summary statistics (min, quartiles, mean, max).
summary(dataSet)
battery_power blue clock_speed dual_sim fc four_g int_memory m_dep mobile_wt n_cores pc
Min. : 501.0 Min. :0.000 Min. :0.500 Min. :0.0000 Min. : 0.000 Min. :0.0000 Min. : 2.00 Min. :0.1000 Min. : 80.0 Min. :1.000 Min. : 0.000
1st Qu.: 851.8 1st Qu.:0.000 1st Qu.:0.700 1st Qu.:0.0000 1st Qu.: 1.000 1st Qu.:0.0000 1st Qu.:16.00 1st Qu.:0.2000 1st Qu.:109.0 1st Qu.:3.000 1st Qu.: 5.000
Median :1226.0 Median :0.000 Median :1.500 Median :1.0000 Median : 3.000 Median :1.0000 Median :32.00 Median :0.5000 Median :141.0 Median :4.000 Median :10.000
Mean :1238.5 Mean :0.495 Mean :1.522 Mean :0.5095 Mean : 4.309 Mean :0.5215 Mean :32.05 Mean :0.5018 Mean :140.2 Mean :4.521 Mean : 9.916
3rd Qu.:1615.2 3rd Qu.:1.000 3rd Qu.:2.200 3rd Qu.:1.0000 3rd Qu.: 7.000 3rd Qu.:1.0000 3rd Qu.:48.00 3rd Qu.:0.8000 3rd Qu.:170.0 3rd Qu.:7.000 3rd Qu.:15.000
Max. :1998.0 Max. :1.000 Max. :3.000 Max. :1.0000 Max. :19.000 Max. :1.0000 Max. :64.00 Max. :1.0000 Max. :200.0 Max. :8.000 Max. :20.000
px_height px_width ram sc_h sc_w talk_time three_g touch_screen wifi price_range
Min. : 0.0 Min. : 500.0 Min. : 256 Min. : 5.00 Min. : 0.000 Min. : 2.00 Min. :0.0000 Min. :0.000 Min. :0.000 Min. :0.00
1st Qu.: 282.8 1st Qu.: 874.8 1st Qu.:1208 1st Qu.: 9.00 1st Qu.: 2.000 1st Qu.: 6.00 1st Qu.:1.0000 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.75
Median : 564.0 Median :1247.0 Median :2146 Median :12.00 Median : 5.000 Median :11.00 Median :1.0000 Median :1.000 Median :1.000 Median :1.50
Mean : 645.1 Mean :1251.5 Mean :2124 Mean :12.31 Mean : 5.767 Mean :11.01 Mean :0.7615 Mean :0.503 Mean :0.507 Mean :1.50
3rd Qu.: 947.2 3rd Qu.:1633.0 3rd Qu.:3064 3rd Qu.:16.00 3rd Qu.: 9.000 3rd Qu.:16.00 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:2.25
Max. :1960.0 Max. :1998.0 Max. :3998 Max. :19.00 Max. :18.000 Max. :20.00 Max. :1.0000 Max. :1.000 Max. :1.000 Max. :3.00
# Correlation heatmap of all columns.
library(ggcorrplot)
# Set the plot size first: the repr.plot.* options can only influence plots
# rendered after they are set, so calling fig() after the plot was too late
# for this figure (assumes the front end reads them at render time).
fig(18, 16)
# Pairwise correlations with cor()'s default method; every column is still
# numeric at this point, so the whole data frame can be passed as is.
corr <- round(cor(dataSet), 8)
ggcorrplot(corr)
# Compact structure listing: type and first values of each column.
str(dataSet)
'data.frame': 2000 obs. of 21 variables:
$ battery_power: int 842 1021 563 615 1821 1859 1821 1954 1445 509 ...
$ blue : int 0 1 1 1 1 0 0 0 1 1 ...
$ clock_speed : num 2.2 0.5 0.5 2.5 1.2 0.5 1.7 0.5 0.5 0.6 ...
$ dual_sim : int 0 1 1 0 0 1 0 1 0 1 ...
$ fc : int 1 0 2 0 13 3 4 0 0 2 ...
$ four_g : int 0 1 1 0 1 0 1 0 0 1 ...
$ int_memory : int 7 53 41 10 44 22 10 24 53 9 ...
$ m_dep : num 0.6 0.7 0.9 0.8 0.6 0.7 0.8 0.8 0.7 0.1 ...
$ mobile_wt : int 188 136 145 131 141 164 139 187 174 93 ...
$ n_cores : int 2 3 5 6 2 1 8 4 7 5 ...
$ pc : int 2 6 6 9 14 7 10 0 14 15 ...
$ px_height : int 20 905 1263 1216 1208 1004 381 512 386 1137 ...
$ px_width : int 756 1988 1716 1786 1212 1654 1018 1149 836 1224 ...
$ ram : int 2549 2631 2603 2769 1411 1067 3220 700 1099 513 ...
$ sc_h : int 9 17 11 16 8 17 13 16 17 19 ...
$ sc_w : int 7 3 2 8 2 1 8 3 1 10 ...
$ talk_time : int 19 7 9 11 15 10 18 5 20 12 ...
$ three_g : int 0 1 1 1 1 1 1 1 1 1 ...
$ touch_screen : int 0 1 1 0 1 0 0 1 0 0 ...
$ wifi : int 1 0 0 0 0 0 1 1 0 0 ...
$ price_range : int 1 2 2 2 1 1 3 0 0 0 ...
prop.table(table(dataSet$blue)) # proportions (0-1, not percentages) of each Bluetooth flag value
0 1
0.505 0.495
prop.table(table(dataSet$dual_sim)) # proportions (0-1, not percentages) of each dual-SIM flag value
0 1
0.4905 0.5095
prop.table(table(dataSet$four_g)) # proportions (0-1, not percentages) of each 4G flag value
0 1
0.4785 0.5215
prop.table(table(dataSet$three_g)) # proportions (0-1, not percentages) of each 3G flag value
0 1
0.2385 0.7615
prop.table(table(dataSet$touch_screen)) # proportions (0-1, not percentages) of each touch-screen flag value
0 1
0.497 0.503
prop.table(table(dataSet$wifi)) # proportions (0-1, not percentages) of each Wi-Fi flag value
0 1
0.493 0.507
Subplots using filtered dataset
# Overlaid bar charts of the screen height and width values: the two columns
# are stacked into one long data frame, each value tagged with the dimension
# it came from, and geom_bar() counts how often each value occurs.
library(ggplot2)
data <- data.frame(
  Dimensions_in_cm = c(dataSet$sc_h, dataSet$sc_w),
  Screen = rep(c("Height", "Width"), each = nrow(dataSet))
)
# position = "identity" overlays the two series instead of stacking them;
# the transparency keeps both visible where they overlap.
ggplot(data, aes(x = Dimensions_in_cm, fill = Screen)) +
  geom_bar(position = "identity", alpha = 0.6)
# Side-by-side figure: pixel resolution (height vs width) and RAM by price
# range.
library(ggplot2)
library(gridExtra)
# Set the plot size before drawing so that it applies to this figure.
fig(24, 20)
# price_range is still an integer column at this point, so it is wrapped in
# factor() inside aes(): ggplot2 then treats it as discrete classes instead of
# a continuous scale (a continuous x collapsed the box plot into a single box
# and triggered the "Continuous x aesthetic" warning).
# p1 maps two continuous variables, which a box plot cannot represent, so the
# observations are drawn as points.
p1 <- ggplot(dataSet, aes(x = px_width, y = px_height,
                          color = factor(price_range))) +
  geom_point() +
  labs(title = "Pixel Resolution Height vs Pixel Resolution Width",
       color = "price_range")
# One box per price class; outliers are highlighted as large red stars.
p2 <- ggplot(dataSet, aes(x = factor(price_range), y = ram,
                          color = factor(price_range))) +
  geom_boxplot(outlier.colour = "red", outlier.shape = 8,
               outlier.size = 4) +
  labs(title = "RAM vs Price Range", x = "price_range",
       color = "price_range")
grid.arrange(p1, p2, nrow = 1)
# Side-by-side box plots of internal memory and battery power by price range.
library(ggplot2)
library(gridExtra)
# Set the plot size before drawing so that it applies to this figure rather
# than only to whatever is plotted afterwards.
fig(24, 20)
# Convert the target to a factor so ggplot2 draws one box (and one colour)
# per price class; the column stays a factor for the rest of the script.
dataSet$price_range <- as.factor(dataSet$price_range)
# Outliers are highlighted as large red stars in both panels.
p3 <- ggplot(dataSet, aes(x = price_range, y = int_memory,
                          color = price_range)) +
  geom_boxplot(outlier.colour = "red", outlier.shape = 8,
               outlier.size = 4) +
  labs(title = "int_memory vs Price Range")
p4 <- ggplot(dataSet, aes(x = price_range, y = battery_power,
                          color = price_range)) +
  geom_boxplot(outlier.colour = "red", outlier.shape = 8,
               outlier.size = 4) +
  labs(title = "Battery power vs Price Range")
grid.arrange(p3, p4, nrow = 1)
# Scatter plots of RAM against other features, coloured by price range, each
# displayed with the ellipses added by stat_ellipse().
library(ggplot2)

# RAM against battery power.
p <- ggplot(data = dataSet,
            mapping = aes(x = battery_power, y = ram, colour = price_range)) +
  geom_point()
p + stat_ellipse()

# RAM against internal memory.
p <- ggplot(data = dataSet,
            mapping = aes(x = int_memory, y = ram, colour = price_range)) +
  geom_point()
p + stat_ellipse()

# NOTE(review): x and y are both `ram`, so every point lies on the diagonal.
# This looks like a copy-paste slip -- confirm which feature was intended.
p <- ggplot(data = dataSet,
            mapping = aes(x = ram, y = ram, colour = price_range)) +
  geom_point()
p + stat_ellipse()
library(tidyverse)
library(plotly)
# Build the interactive 3D chart step by step: battery power on x, RAM on y,
# internal memory on z, with the colour taken from the price range.
p <- plot_ly(
  dataSet,
  x = dataSet$battery_power,
  y = dataSet$ram,
  z = dataSet$int_memory,
  color = dataSet$price_range
)
# Draw the observations as markers.
p <- add_markers(p, size = 1)
# Label the three axes of the 3D scene.
p <- layout(
  p,
  scene = list(
    xaxis = list(title = "Battery Power"),
    yaxis = list(title = "Ram"),
    zaxis = list(title = "Memoire interne")
  )
)
# Display the chart.
p